# Install packages (only those not already available)
# Calling install.packages() unconditionally re-downloads and reinstalls every
# package each time the script runs, so check what is missing first.
required_packages <- c("writexl", "readxl", "tidyverse")
package_available <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!package_available)) {
  install.packages(required_packages[!package_available])
}

# Load packages
library(writexl)
library(readxl)
library(tidyverse)

# Read in the raw Qualtrics export (file is read with Latin-1 encoding)
data <- read.csv(
  file = "Data_raw/Dataset_qualtrics_raw.csv",
  fileEncoding = "Latin1"
)

# Rename variables in a single call (new_name = old_name).
# None of the new names is reused as an old name, so the order is irrelevant.
data <- rename(
  data,
  # Nicotine dependence items
  FTND1 = Nicotine.Dependence.,
  FTND2 = Q17,
  FTND3 = Q18,
  FTND4 = Q19,
  FTND5 = Q20,
  FTND6 = Q21,
  # Participant / survey details
  participant_ID = Participant.ID,
  duration = Duration..in.seconds.,
  smoking_frequency = Smoking.frequency,
  # Free-text demographic responses
  Gender_other = Gender_5_TEXT,
  ethnicity_other_white = Ethnicity_4_TEXT,
  ethnicity_other_black = Ethnicity_8_TEXT,
  ethnicity_other_asian = Ethnicity_14_TEXT,
  ethnicity_other_mixed = Ethnicity_18_TEXT,
  ethnicity_any_other = Ethnicity_20_TEXT,
  no_prev_quits = No..previous.quits,
  # Anhedonia items
  SHAPS1 = Anhedonia,
  SHAPS2 = Anhedonia.1,
  SHAPS3 = Anhedonia.2,
  SHAPS4 = Anhedonia.3,
  SHAPS5 = Anhedonia.4,
  SHAPS6 = Anhedonia.5,
  SHAPS7 = Anhedonia.6,
  SHAPS8 = Anhedonia.7,
  SHAPS9 = Anhedonia.8,
  SHAPS10 = Anhedonia.9,
  SHAPS11 = Anhedonia.10,
  SHAPS12 = Anhedonia.11,
  SHAPS13 = Anhedonia.12,
  SHAPS14 = Anhedonia.13,
  # Pre-exposure quitting measures
  pre_motivation_to_stop = Motivation.to.Stop,
  pre_plan_to_quit = Intention.to.quit_1,
  pre_quit_confidence = Quitting.efficacy,
  Pre_cutdown_ease = Quitting.efficacy.,
  # Pre-exposure smoking beliefs
  SB_relax = Smoking.beliefs._1,
  SB_stress = Smoking.beliefs._2,
  SB_weight = Smoking.beliefs._3,
  SB_comfort = Smoking.beliefs._4,
  SB_popular = Smoking.beliefs._5,
  SB_no_harm = Smoking.beliefs._6,
  # PHQ9 items
  PHQ9_1 = Q6_1,
  PHQ9_2 = Q6_2,
  PHQ9_3 = Q6_3,
  PHQ9_4 = Q6_4,
  PHQ9_5 = Q6_5,
  PHQ9_6 = Q6_6,
  PHQ9_7 = Q6_7,
  PHQ9_8 = Q6_8,
  PHQ9_9 = Q6_9,
  # GAD7 items
  GAD7_1 = Q7_1,
  GAD7_2 = Q7_2,
  GAD7_3 = Q7_3,
  GAD7_4 = Q7_4,
  GAD7_5 = Q7_5,
  GAD7_6 = Q7_6,
  GAD7_7 = Q7_7,
  # Post-exposure quitting measures
  post_motivation_to_stop = Q107,
  post_plan_to_stop = Q108_1,
  post_confidence_to_stop = Q109,
  post_cut_down_ease = Q110,
  # Post-exposure smoking beliefs
  post_SB_relax = Q113_1,
  post_SB_stress = Q113_2,
  post_SB_weight = Q113_3,
  post_SB_comfort = Q113_4,
  post_SB_popular = Q113_5,
  post_SB_no_harm = Q113_6,
  # Message ratings
  worth_remembering = Attention_1,
  grabbed_attention = Q45_1,
  pleasant = Pleasant._1,
  arousal = Arousal_1,
  dominance = Dominance_1,
  belief = Q120_1,
  # Scale totals already present in the raw file
  PHQ9_total = PHQ9,
  GAD7_total = GAD7,
  # Allocated condition
  condition = Combined.condition.column
)

# Translate the numeric demographic codes into text labels.
# Each recode writes to a new column; the numeric originals are left untouched here.
# NOTE(review): the labels "Carribbean" and "white_black_ caribbean" (with a space)
# are kept exactly as they were - confirm before correcting, since any later
# code that matches on these strings would need changing too.

# Ethnicity
data$ethnicity_coded <- recode(
  data$Ethnicity,
  `1` = "White_British",
  `2` = "Irish",
  `3` = "Gypsy_Traveller",
  `4` = "other_white",
  `5` = "Black_British",
  `6` = "African",
  `7` = "Carribbean",
  `8` = "Other_black",
  `9` = "Asian_British",
  `10` = "Indian",
  `11` = "Pakistani",
  `12` = "Bangladeshi",
  `13` = "Chinese",
  `14` = "other_asian",
  `15` = "white_black_ caribbean",
  `16` = "white_black_african",
  `17` = "white_asian",
  `18` = "other_mixed",
  `19` = "arab",
  `20` = "any_other",
  `21` = "prefer_not_say"
)

# Education
data$education <- recode(
  data$Education,
  `1` = "none",
  `2` = "GCSE",
  `3` = "a_levels",
  `4` = "Undergrad",
  `5` = "postgrad",
  `6` = "prefer_not_say"
)

# Gender
data$gender <- recode(
  data$Gender,
  `1` = "Man",
  `2` = "Woman",
  `3` = "non_binary",
  `4` = "prefer_not_say",
  `5` = "other"
)

# Country of residence
data$residence <- recode(
  data$Residence,
  `1` = "England",
  `2` = "Wales",
  `3` = "Scotland",
  `4` = "NI",
  `5` = "Not_UK"
)

# SHAPS total: add the 14 item columns together, left to right.
# A missing item makes that participant's total NA.
shaps_items <- paste0("SHAPS", 1:14)
data$SHAPS_total <- Reduce(`+`, data[shaps_items])

# Recode the FTND response codes to their item scores
data$FTND1 <- recode(data$FTND1, `1` = 0, `2` = 1, `3` = 2, `4` = 3)
data$FTND2 <- recode(data$FTND2, `1` = 1, `2` = 0)
data$FTND3 <- recode(data$FTND3, `1` = 1, `2` = 0)
data$FTND4 <- recode(data$FTND4, `1` = 0, `2` = 1, `3` = 2, `4` = 3)
data$FTND5 <- recode(data$FTND5, `1` = 0, `2` = 1)
data$FTND6 <- recode(data$FTND6, `1` = 0, `2` = 1)

# FTND total: add the six recoded item scores (NA if any item is missing)
ftnd_items <- paste0("FTND", 1:6)
data$FTND_total <- Reduce(`+`, data[ftnd_items])

#Remove unnecessary columns
# The Qualtrics page-timing columns are named <block>_<metric><page suffix>,
# so their 96 names are generated rather than typed out by hand (the hand-typed
# list had GAD7_2 repeated three times and was assigned with `=`).
timing_grid <- expand.grid(
  metric = c("First.Click", "Last.Click", "Page.Submit", "Click.Count"),
  page = c("", ".1", ".2", ".3"),
  block = c("MH.MH", "PH.MH", "BM.MH", "MH.non.MH", "PH.non.MH", "Blank.non.MH"),
  stringsAsFactors = FALSE
)
timing_cols <- paste0(timing_grid$block, "_", timing_grid$metric, timing_grid$page)

cols_to_drop <- c(
  "Qualtrics.Age",
  "Residence",
  # Item-level responses
  paste0("PHQ9_", 1:9),
  paste0("GAD7_", 1:7),
  timing_cols,
  "FL_47_DO",
  "X",
  paste0("SHAPS", 1:14),
  # Numeric demographic codes and their free-text companions
  "Gender",
  "Gender_other",
  "Education",
  "Ethnicity",
  paste0("ethnicity_other_", c("white", "black", "asian", "mixed")),
  "ethnicity_any_other"
)

# Fail loudly if an expected column is absent, as subset() would have done
absent_cols <- setdiff(cols_to_drop, names(data))
if (length(absent_cols) > 0) {
  stop(
    "Columns to remove not found in data: ",
    paste(absent_cols, collapse = ", "),
    call. = FALSE
  )
}

# drop = FALSE keeps a data frame even if only one column were to remain
data <- data[, !(names(data) %in% cols_to_drop), drop = FALSE]

# Check that the correct variables remain in the data frame after column removal
names(data)

# View a summary of every variable to spot any out-of-range values
summary(data)

# Histograms of the SHAPS, PHQ9 and GAD7 totals to inspect their distributions
hist(data$SHAPS_total)
hist(data$PHQ9_total)
hist(data$GAD7_total)

#Recode participant condition as numeric variable as follows:
#PH non MH condition = 1
#PH MH condition = 2
#Blank non MH condition = 3
#Blank MH condition = 4
#MH non MH condition = 5
#MH MH condition  = 6
#No condition (blank) = 7, and those rows are removed further down
#
# The labels are matched exactly as written, including their leading/trailing
# spaces. NOTE(review): presumably this is how they appear in the raw
# condition column - confirm before tidying the strings.
data$condition_n <- recode(
  data$condition,
  " PHMessagingnonMHgroup" = 1,
  "PHMessagingMHgroup " = 2,
  " BlankmessagingnonMHgroup" = 3,
  "BlankmessagingMHgroup " = 4,
  " MHMessagingnonMHgroup" = 5,
  "MHmessagingMHgroup " = 6,
  " " = 7
)

#Check recoding of condition has worked and that recognised as numeric not character string
summary(data$condition_n)
class(data$condition_n)

#Check spread of condition
table (data$condition_n)

#Remove rows / participants with no condition (coded 7).
# Rows are selected by their condition code rather than by hard-coded row
# numbers, so the right participants are removed even if the raw file gains,
# loses or reorders rows. %in% is used instead of == so that an NA code is
# never selected for removal.
no_condition <- data$condition_n %in% 7
data[no_condition, ]   #inspect the rows about to be removed
data <- data[!no_condition, ]

## Experimental (message) condition with 3 levels, looked up by condition code:
##   1 = Physical (conditions 1-2), 2 = Blank (3-4), 3 = Mental (5-6)
## Position i of the lookup vector holds the level for condition code i; a code
## outside 1-6, or a missing code, yields NA.
message_level_by_condition <- c(1, 1, 2, 2, 3, 3)
data$experimental_con <- message_level_by_condition[data$condition_n]

## Mental health status with two levels, from the same condition code:
##   1 for conditions 2, 4 and 6, otherwise 0; a missing code stays NA
mh_status_by_condition <- c(0, 1, 0, 1, 0, 1, 0)
data$mental_health_status <- mh_status_by_condition[data$condition_n]

### Calculate outcome variables ###

## Each outcome is the row-wise mean of the items listed for it. No na.rm is
## used, so a participant missing any item gets NA for that outcome.
## NOTE(review): an earlier comment here reported that the pre quitting
## self-efficacy score did not work because of the renaming of the two
## "Quitting.efficacy" columns - confirm whether that is still the case.
## The double underscore in post_quit__self_efficacy is kept as it was.
composite_items <- list(
  pre_quit_self_efficacy = c("pre_quit_confidence", "Pre_cutdown_ease"),
  post_quit__self_efficacy = c("post_confidence_to_stop", "post_cut_down_ease"),
  pre_smoking_beliefs = c(
    "SB_relax", "SB_stress", "SB_weight",
    "SB_comfort", "SB_popular", "SB_no_harm"
  ),
  post_smoking_beliefs = c(
    "post_SB_relax", "post_SB_stress", "post_SB_weight",
    "post_SB_comfort", "post_SB_popular", "post_SB_no_harm"
  ),
  Attention = c("worth_remembering", "grabbed_attention")
)

# All five scores are computed from the existing columns, then added in the
# order listed above
data[names(composite_items)] <- lapply(
  composite_items,
  function(item_cols) rowMeans(data[, item_cols])
)

# Check ranges etc. of the new scores
summary(data$pre_quit_self_efficacy)
summary(data$post_quit__self_efficacy)
summary(data$pre_smoking_beliefs)
summary(data$post_smoking_beliefs)
summary(data$Attention)

# Check group sizes for the two derived condition variables
table(data$experimental_con)
table(data$mental_health_status)


## STEP 3: Save the cleaned dataset as "Positive_Messaging dataset_cleaned.csv"
## (write.csv defaults are used, so row names are written as the first column)
write.csv(
  x = data,
  file = "Output/Data_cleaned/Positive_Messaging dataset_cleaned.csv"
)
